# This script tabulates the top 10 high-scoring (by cosine similarity) opposition articles for Egypt.
# It produces a LaTeX file with the articles for Appendix Section L.
import pandas as pd
import re

# Source CSV of translated articles and the LaTeX file to generate from it
input_csv = "data/output/cos_sims_robustness/high_cos_sim_articles_translated_150000030k.csv"
output_tex = "data/output/cos_sims_robustness/high_cos_sim_articles_translated_150000030k.tex"

# Load the translated articles into a DataFrame
df = pd.read_csv(input_csv)

# Stop early unless both the article text column and the score column are present
required_columns = ("content_translated", "val")
if not all(column in df.columns for column in required_columns):
    raise ValueError("Required columns 'content_translated' or 'val' not found in the dataset.")

# Function to extract relevant sentences around TARGETWORD
def extract_context(text, keyword="TARGETWORD"):
    """Return the first sentence containing *keyword* with its neighbouring sentences.

    Sentences are obtained by splitting on whitespace that follows ``.``,
    ``!`` or ``?``.

    Args:
        text: Article text to search. Non-string values (for example the
            float NaN that a missing CSV cell becomes when loaded with
            pandas) are treated as having no match.
        keyword: Substring that marks the sentence of interest.

    Returns:
        The sentence before the match, the matching sentence and the sentence
        after it, joined by single spaces and stripped of surrounding
        whitespace. Neighbours that do not exist are omitted. ``None`` is
        returned when *text* is not a string or no sentence contains
        *keyword*.
    """
    # re.split raises TypeError on non-strings; report "no context" instead
    # so callers can skip the row the same way they skip unmatched articles.
    if not isinstance(text, str):
        return None

    sentences = re.split(r'(?<=[.!?])\s+', text)  # Split text into sentences
    target_idx = next((i for i, s in enumerate(sentences) if keyword in s), None)

    if target_idx is None:
        return None  # Skip if the keyword is not found

    # Clamp the lower bound so a match in the first sentence does not wrap
    # around to the end of the list; slicing already clamps the upper bound.
    window = sentences[max(target_idx - 1, 0):target_idx + 2]
    return " ".join(window).strip()

# Emit one LaTeX entry per article whose text mentions TARGETWORD
with open(output_tex, "w", encoding="utf-8") as tex_file:
    wrote_entry = False  # Flips to True once the first entry has been started

    for idx, row in df.iterrows():
        article_text = row["content_translated"]
        similarity = row["val"]  # Cosine similarity score
        # Use the row's doc_id when available, otherwise a label built from the
        # index; underscores are escaped for LaTeX
        raw_label = row.get("doc_id", f"Article {idx+1}")
        label = str(raw_label).replace("_", r"\_")

        snippet = extract_context(article_text, keyword="TARGETWORD")
        if snippet is None:
            # Nothing to show for this article
            continue

        # Entries are separated by a blank line; the first one gets no separator
        if wrote_entry:
            tex_file.write("\n\n")
        else:
            wrote_entry = True

        # Coloured "<id>: <score>" heading followed by the verbatim snippet
        heading = f"\\textcolor{{red}}{{{label}: {similarity:.4f}}} "
        listing = "".join(("\\begin{lstlisting}\n", snippet + "\n", "\\end{lstlisting}"))
        tex_file.write(heading)
        tex_file.write(listing)

print(f"LaTeX file generated: {output_tex}")
